#-*- coding:utf-8 -*-
__author__ = 'qiang'

import re
import time
import os
import json
import os.path

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from pymongo.errors import DuplicateKeyError

# Directory tree that the script walks (via os.walk) looking for input files.
rootdir = "./data"
# Pattern with a single capture group for an e-mail-like token: a local part,
# "@", one domain label, a literal dot, then the remainder of the domain.
reg = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.]+)"

# Example of a connection URI that carries credentials (not used):
#uri = "mongodb://user:password@example.com/the_database"
# Connect to the MongoDB server; the URI limits the connection pool to 10.
client = MongoClient("mongodb://119.29.165.135:27017/?maxPoolSize=10")
# NOTE(review): authentication is currently disabled, and this commented-out
# call embeds what look like real credentials -- consider removing it from the
# source and loading credentials from configuration instead.
#client.crawler.authenticate('qiang', 'abc123', mechanism='SCRAM-SHA-1')
# The "users" collection of the "crawler" database receives the addresses.
collection = client["crawler"]["users"]
# Unique index on "email" so that inserting an address that is already stored
# is rejected (the insert loop relies on this by catching DuplicateKeyError).
# NOTE(review): ensure_index is deprecated in PyMongo 3.x in favour of
# create_index -- confirm against the installed driver version.
collection.ensure_index("email", unique=True)

# Compile once: the same pattern is applied to every line of every file.
email_re = re.compile(reg)

# Walk rootdir recursively. For every file, take the e-mail address found at
# the START of each line (re.match anchors at the beginning) and insert it
# into the users collection, printing a per-file count of new addresses.
for parent, dirnames, filenames in os.walk(rootdir):
    for filename in filenames:
        path = os.path.join(parent, filename)
        print("begin:" + path)
        num = 0
        # `with` guarantees the handle is closed even when an insert raises
        # something other than DuplicateKeyError (e.g. a connection failure).
        with open(path) as r:
            # Iterate the file lazily instead of readlines() so large inputs
            # are not loaded into memory all at once.
            for line in r:
                res = email_re.match(line)
                if not res:
                    continue
                cur_time = int(time.time())
                email = res.group(1)
                try:
                    # NOTE(review): Collection.insert is deprecated in
                    # PyMongo 3.x (insert_one replaces it) -- kept as-is
                    # because the installed driver version is unknown.
                    collection.insert({
                        "email": email,
                        "times": 0,
                        "status": 0,
                        "update_time": cur_time,
                        "create_time": cur_time
                    })
                except DuplicateKeyError as err:
                    # Address already stored: report it and keep going.
                    print(err)
                else:
                    # Only newly inserted addresses are echoed and counted.
                    print(email)
                    num = num + 1
        print("end:" + path + "count:" + str(num))
